{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
    "cats = ['alt.atheism', 'sci.space']\n",
    "newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)\n",
    "newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = newsgroups_train.data\n",
    "X_test = newsgroups_test.data\n",
    "y_train = newsgroups_train.target\n",
    "y_test = newsgroups_test.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this calculates a vector of term frequencies for \n",
    "# each document\n",
    "vect = CountVectorizer()\n",
    "\n",
    "# this normalizes each term frequency by the \n",
    "# number of documents having that term\n",
    "tfidf = TfidfTransformer()\n",
    "\n",
    "# this is a linear SVM classifier\n",
    "clf = LinearSVC()\n",
    "\n",
    "pipeline = Pipeline([\n",
    "    ('vect',vect),\n",
    "    ('tfidf',tfidf),\n",
    "    ('clf',clf)\n",
    "])\n",
    "\n",
    "scores = cross_val_score(pipeline,X_train,y_train,cv=3,\n",
    "    scoring='f1_micro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.99162011,  0.98882682,  0.99159664])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.99068118867658794"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now train and predict test instances\n",
    "pipeline.fit(X_train,y_train)\n",
    "y_preds = pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.97475455820476853"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate f1\n",
    "f1_score(y_test, y_preds, average='micro')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Global TF Kernel (Python 3)",
   "language": "python",
   "name": "global-tf-python-3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
